Working Directory: /Users/jamesdiao/Documents/Kohane_Lab/2016-paper-ACMG-penetrance/ACMG_Penetrance
\newpagehttp://www.ncbi.nlm.nih.gov/clinvar/docs/acmg/
ACMG.panel <- ACMG.table[,"Gene_Name"] %>% unique
cat(sprintf("Processed Table from ACMG Website %s x %s (selected rows):", nrow(ACMG.table), ncol(ACMG.table)))
## Processed Table from ACMG Website 64 x 4 (selected rows):
row.names(ACMG.table) <- paste0("N",1:nrow(ACMG.table))
ACMG.table[-c(3:4,6:9,15,19,21:27,29,33:36,38:39,41:43,45,47,49:50,54:56,62),] %>% pander
| Disease_Name | Disease_MIM | Gene_Name | Gene_MIM | |
|---|---|---|---|---|
| N1 | Adenomatous polyposis coli | 175100 | APC | 611731 |
| N2 | Aortic aneurysm, familial thoracic 4 | 132900 | MYH11 | 160745 |
| N5 | Arrhythmogenic right ventricular cardiomyopathy, type 5 | 604400 | TMEM43 | 612048 |
| N10 | Breast-ovarian cancer, familial 1 | 604370 | BRCA1 | 113705 |
| N11 | Breast-ovarian cancer, familial 2 | 612555 | BRCA2 | 600185 |
| N12 | Brugada syndrome 1 | 601144 | SCN5A | 600163 |
| N13 | Catecholaminergic polymorphic ventricular tachycardia | 604772 | RYR2 | 180902 |
| N14 | Dilated cardiomyopathy 1A | 115200 | LMNA | 150330 |
| N16 | Ehlers-Danlos syndrome, type 4 | 130050 | COL3A1 | 120180 |
| N17 | Fabry’s disease | 301500 | GLA | 300644 |
| N18 | Familial hypercholesterolemia | 143890 | APOB | 107730 |
| N20 | Familial hypertrophic cardiomyopathy 1 | 192600 | MYH7 | 160760 |
| N28 | Familial medullary thyroid carcinoma | 155240 | RET | 164761 |
| N30 | Left ventricular noncompaction 6 | 601494 | TNNT2 | 191045 |
| N31 | Li-Fraumeni syndrome 1 | 151623 | TP53 | 191170 |
| N32 | Loeys-Dietz syndrome type 1A | 609192 | TGFBR1 | 190181 |
| N37 | Long QT syndrome 1 | 192500 | KCNQ1 | 607542 |
| N40 | Lynch syndrome | 120435 | MLH1 | 120436 |
| N44 | Malignant hyperthermia | 145600 | RYR1 | 180901 |
| N46 | Marfan’s syndrome | 154700 | FBN1 | 134797 |
| N48 | Multiple endocrine neoplasia, type 1 | 131100 | MEN1 | 613733 |
| N51 | MYH-associated polyposis | 608456 | MUTYH | 604933 |
| N52 | Neurofibromatosis, type 2 | 101000 | NF2 | 607379 |
| N53 | Paragangliomas 1 | 168000 | SDHD | 602690 |
| N57 | Peutz-Jeghers syndrome | 175200 | STK11 | 602216 |
| N58 | Pilomatrixoma | 132600 | MUTYH | 604933 |
| N59 | PTEN hamartoma tumor syndrome | 153480 | PTEN | 601728 |
| N60 | Retinoblastoma | 180200 | RB1 | 614041 |
| N61 | Tuberous sclerosis 1 | 191100 | TSC1 | 605284 |
| N63 | Von Hippel-Lindau syndrome | 193300 | VHL | 608537 |
| N64 | Wilms’ tumor | 194070 | WT1 | 607102 |
cat("ACMG-56 Genes:")
## ACMG-56 Genes:
print(ACMG.panel, quote = F)
## [1] APC MYH11 ACTA2 MYLK TMEM43 DSP PKP2 DSG2
## [9] DSC2 BRCA1 BRCA2 SCN5A RYR2 LMNA MYBPC3 COL3A1
## [17] GLA APOB LDLR MYH7 TPM1 PRKAG2 TNNI3 MYL3
## [25] MYL2 ACTC1 RET PCSK9 TNNT2 TP53 TGFBR1 TGFBR2
## [33] SMAD3 KCNQ1 KCNH2 MLH1 MSH2 MSH6 PMS2 RYR1
## [41] CACNA1S FBN1 MEN1 MUTYH NF2 SDHD SDHAF2 SDHC
## [49] SDHB STK11 PTEN RB1 TSC1 TSC2 VHL WT1
ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz
ClinVar is the central repository for variant interpretations. Relevant information from the VCF includes:
(a) CLNSIG = “Variant Clinical Significance, 0 - Uncertain, 1 - Not provided, 2 - Benign, 3 - Likely benign,
4 - Likely pathogenic, 5 - Pathogenic, 6 - Drug response, 7 - Histocompatibility, 255 - Other”
(b) CLNDBN = “Variant disease name”
(c) CLNDSDBID = “Variant disease database ID”
(d) INTERP = Pathogenicity (likely pathogenic or pathogenic; CLNSIG = 4 or 5)
#Number to interpretation
clnsig_map <- c(0:7,255,-1) %>% setNames(c("Uncertain",
"Not_Provided","Benign","Likely_Benign","Likely_Pathogenic",
"Pathogenic","Drug_Response","Histocompatibility","Other","NA"))
get_clinvar <- function(clinvar_file) {
file.by.line <- readLines(clinvar_file)
#file_date <- as.Date(strsplit(file.by.line[2],"=")[[1]][2], "%Y%m%d")
#system(sprintf("mv %s ClinVar_Reports/clinvar_%s.vcf", clinvar_file, file_date))
clean.lines <- file.by.line[!grepl("##.*", file.by.line)] #Remove ## comments
clean.lines[1] <- sub('.', '', clean.lines[1]) #Remove # from header
input <- read.table(text = paste(clean.lines, collapse = "\n"), header = T, stringsAsFactors = F,
comment.char = "", quote = "", sep = "\t")
input <- input[nchar(input$REF)==1,] #deletions
alt_num <- sapply(strsplit(input$ALT,","),length) #number of alts
acceptable_nchar <- 2*alt_num-1 #adds in the length from commas, if each alt is 1 nt.
input <- input[nchar(input$ALT)==acceptable_nchar,] #insertions
input$ALT <- strsplit(input$ALT,",")
split_all <- strsplit(input$INFO,";")
has.clndsdbid <- any(grepl("CLNDSDBID", split_all))
split_info <- function(name) {
sapply(split_all, function(entry) {
entry[grep(name,entry)]
}) %>% strsplit("=") %>% sapply(function(x) x[2]) %>% strsplit(",")
}
input$CLNALLE <- split_info("CLNALLE=") %>% sapply(as.integer)
input$CLNSIG <- split_info("CLNSIG=")
input$CLNDBN <- split_info("CLNDBN=")
if (has.clndsdbid)
input$CLNDSDBID <- split_info("CLNDSDBID=")
#CLNALLE has 0,-1,3,4 --> CLNSIG has 1,2,3,4 --> ALT has 1.
taking <- sapply(input$CLNALLE, function(x) x[x>0] ) #Actual elements > 0. Keep these in CLNSIG and ALT
taking_loc <- sapply(input$CLNALLE, function(x) which(x>0) )#Tracks locations for keeping in CLNALLE
keep <- sapply(taking, length)>0 #reduce everything to get rid of 0 and -1
# Reduce, reduce, reduce.
taking <- taking[keep]
taking_loc <- taking_loc[keep]
input <- input[keep,]
#Make this more readable
input$ALT <- sapply(1:nrow(input), function(row) {
input$ALT[[row]][taking[[row]]]
})
col_subset <- function(name) {
sapply(1:nrow(input), function(row) {
unlist(input[row,name])[taking_loc[[row]]]
})
}
input$CLNSIG <- col_subset("CLNSIG")
input$CLNALLE <- col_subset("CLNALLE")
input$CLNDBN <- col_subset("CLNDBN")
if (has.clndsdbid)
input$CLNDSDBID <- col_subset("CLNDSDBID")
filter_condition <- input[,unlist(lapply(input, typeof))=="list"] %>%
apply(1,function(row) !any(is.na(row)))
input <- input %>% filter(filter_condition) %>%
unnest %>% unite(VAR_ID, CHROM, POS, REF, ALT, sep = "_", remove = F) %>%
select(VAR_ID, CHROM, POS, ID, REF, ALT, CLNALLE, CLNSIG, everything()) %>%
mutate(CLNSIG = strsplit(CLNSIG,"|",fixed = T)) %>%
mutate(CLNDBN = strsplit(CLNDBN,"|",fixed = T)) %>%
mutate(POS = as.integer(POS))
if (has.clndsdbid)
input <- input %>% mutate(CLNDSDBID = strsplit(CLNDSDBID,"|",fixed = T))
input$CLNSIG <- sapply(input$CLNSIG, function(x) as.integer(x))
input$INTERP <- sapply(input$CLNSIG, function(x) any(x %in% c(4,5)) )
input
}
clinvar <- get_clinvar(clinvar_file)
cat(sprintf("Processed ClinVar data frame %s x %s (selected rows/columns):", nrow(clinvar), ncol(clinvar)))
## Processed ClinVar data frame 126349 x 14 (selected rows/columns):
clinvar[1:6,] %>% select(-CLNALLE, -INFO, -QUAL, -FILTER) %>% remove_rownames %>% pander
| VAR_ID | CHROM | POS | ID | REF | ALT | CLNSIG |
|---|---|---|---|---|---|---|
| 1_949523_C_T | 1 | 949523 | rs786201005 | C | T | 5 |
| 1_949739_G_T | 1 | 949739 | rs672601312 | G | T | 5 |
| 1_955597_G_T | 1 | 955597 | rs115173026 | G | T | 2 |
| 1_955619_G_C | 1 | 955619 | rs201073369 | G | C | 255 |
| 1_957568_A_G | 1 | 957568 | rs115704555 | A | G | 2 |
| 1_957605_G_A | 1 | 957605 | rs756623659 | G | A | 5 |
Table continues below
| CLNDBN | CLNDSDBID | INTERP |
|---|---|---|
| Immunodeficiency_38_with_basal_ganglia_calcification | CN221808:616126 | TRUE |
| Immunodeficiency_38_with_basal_ganglia_calcification | CN221808:616126 | TRUE |
| not_specified | CN169374 | FALSE |
| not_specified | CN169374 | FALSE |
| not_specified | CN169374 | FALSE |
| Congenital_myasthenic_syndrome | C0751882:ORPHA590 | TRUE |
ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.[chrom].phase3_[version].20130502.genotypes.vcf.gz
Downloaded 1000 Genomes VCFs are saved in: /Users/jamesdiao/Documents/Kohane_Lab/2016-paper-ACMG-penetrance/1000G/
download_1000g <- function(gene, download) {
#for tracking: #gene %>% paste(which(ACMG.panel==gene)) %>% paste(length(ACMG.panel), sep = "/") %>% print
success <- FALSE
refGene <- sprintf("select * from refGene where name2 = \"%s\" limit 20", gene) %>% query
UCSC <- select(refGene, name, chrom, start = txStart, end = txEnd)
if (nrow(UCSC) == 0) { #No hit on refGene
return(rep("NOT_FOUND",5) %>% setNames(c("name","chrom","start","end","downloaded")))
} else {
if (nrow(UCSC) > 1) #Multiple hits: take the widest range
UCSC <- UCSC[which.max(UCSC$end-UCSC$start),]
if (download) {
# gets [n] from chr[n]
chrom.num <- strsplit(UCSC$chrom, split = "chr")[[1]][2]
# different version for chromosomes X and Y
version <- switch(chrom.num, "X" = "shapeit2_mvncall_integrated_v1b",
"Y" = "integrated_v2a", "shapeit2_mvncall_integrated_v5a")
command <- paste("tabix -h ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.%s.",
"phase3_%s.20130502.genotypes.vcf.gz %s:%s-%s > %s_genotypes.vcf", sep = "")
sprintf(command, UCSC$chrom, version, chrom.num, UCSC$start, UCSC$end, gene) %>% system
Sys.sleep(2)
# Checks whether the file exists and has non-zero size
exists <- grepl(paste(gene,"_genotypes.vcf",sep =""), system("ls", intern = T)) %>% sum > 0
file.size <- strsplit(paste("stat ","_genotypes.vcf", sep = gene) %>%
system(intern = T), " ")[[1]][8]
success <- exists & file.size > 0
}
}
return(c(UCSC,"downloaded" = success))
}
if (!skip_download & !skip_processing) {
system("mkdir 1000G")
setwd(paste(getwd(), "1000G", sep = "/"))
for (con in dbListConnections(MySQL())) dbDisconnect(con)
con <- dbConnect(MySQL(), user = 'genome',
dbname = 'hg19', host = 'genome-mysql.cse.ucsc.edu',
unix.sock = "/Applications/MAMP/tmp/mysql/mysql.sock")
query <- function (input) { suppressWarnings(dbGetQuery(con, input)) }
download_output <- sapply(ACMG.panel, function(gene) download_1000g(gene, download = T)) %>% t
print(download_output, quote = F)
download_output <- download_output %>%
apply(2, unlist) %>%
as.data.frame(stringsAsFactors = F) %>%
mutate("gene" = rownames(download_output)) %>%
select(gene, everything()) %>%
filter(downloaded != "NOT_FOUND")
download_output <- download_output %>%
mutate(chrom = sapply(strsplit(download_output$chrom,"chr"), function(x) x[2]),
start = as.integer(start), end = as.integer(end),
downloaded = as.logical(downloaded))
write.table(download_output, file = "download_output.txt",
row.names = F, col.names = T, quote = F, sep = "\t")
system("rm *.genotypes.vcf.gz.tbi")
} else {
if (skip_download)
download_output <- read.table("1000G/download_output.txt", header = T, stringsAsFactors = F)
else
download_output <- read.table("Supplementary_Files/download_output.txt", header = T, stringsAsFactors = F)
}
cat(sprintf("Download report: region and successes: %s x %s (selected rows):", nrow(download_output), ncol(download_output)))
## Download report: region and successes: 56 x 6 (selected rows):
download_output[1:5,] %>% format(scientific = F) %>% pander
| gene | name | chrom | start | end | downloaded |
|---|---|---|---|---|---|
| APC | NM_001127511 | 5 | 112043201 | 112181936 | TRUE |
| MYH11 | NM_001040113 | 16 | 15796991 | 15950887 | TRUE |
| ACTA2 | NM_001141945 | 10 | 90694830 | 90751154 | TRUE |
| MYLK | NM_001321309 | 3 | 123331142 | 123603149 | TRUE |
| TMEM43 | NM_024334 | 3 | 14166439 | 14185180 | TRUE |
cat("File saved as download_output.txt in Supplementary_Files")
## File saved as download_output.txt in Supplementary_Files
This will allow us to assign genotypes from the 1000 Genomes VCF to ancestral groups.
From: ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/integrated_call_samples_v3.20130502.ALL.panel
#read the map and delete the file
map <- read.table(file = "Supplementary_Files/phase3map.txt", stringsAsFactors = F, header = T) %>% as.data.frame
#display
cat("Phase 3 Populations Map Table: 2504 x 4 (selected rows)")
## Phase 3 Populations Map Table: 2504 x 4 (selected rows)
map[sample(nrow(map),10),] %>% arrange(super_pop) %>% remove_rownames %>% pander
| sample | pop | super_pop | gender |
|---|---|---|---|
| HG02554 | ACB | AFR | male |
| HG02292 | PEL | AMR | female |
| NA19684 | MXL | AMR | female |
| NA18945 | JPT | EAS | male |
| NA19086 | JPT | EAS | male |
| HG02180 | CDX | EAS | female |
| HG00266 | FIN | EUR | female |
| HG01537 | IBS | EUR | female |
| HG03019 | PJL | SAS | female |
| NA20881 | GIH | SAS | female |
#Make list of populations and superpopulations for later plotting
pop.table <- map[!duplicated(map$pop),] %>%
select(contains("pop")) %>% arrange(super_pop, pop)
super <- pop.table$super_pop %>% setNames(pop.table$pop)
super.levels <- unique(pop.table$super_pop)
pop.levels <- unique(pop.table$pop)
#Plot distribution of ancestral backgrounds
Population = factor(as.character(map$pop), levels = pop.levels)
cat("Population Distribution")
## Population Distribution
ggplot(map, aes(map$super_pop, fill = Population)) +
geom_bar(color = 'black', width = 0.5) +
ylab ("No. of Individuals") + xlab ("Superpopulation") +
ggtitle("1000 Genomes - Samples by Population")
rm(Population)
import.file.1000g <- function(gene) {
sprintf("%s [%s/%s]", gene, grep(gene, ACMG.panel), length(ACMG.panel)) %>% cat
name <- paste("1000G",paste(gene,"genotypes.vcf", sep = "_"), sep = "/")
output <- read.table(paste(getwd(),name,sep="/"), stringsAsFactors = FALSE)
#Add header
names(output)[1:length(header)] <- header
#Remove all single alt indels
output <- output[nchar(output$REF)==1,] #deletions
alt_num <- sapply(strsplit(output$ALT,","),length) #number of alts
acceptable_nchar <- 2*alt_num-1 #adds in the length from commas, if each alt is 1 nt.
output <- output[nchar(output$ALT)==acceptable_nchar,] #insertions
alt_num <- sapply(strsplit(output$ALT,","),length) #recalculate
paired = which(alt_num!=1) #all with ,
#Add AF Column
af <- strsplit(output$INFO,";") %>% sapply("[", 2) %>%
strsplit("AF=") %>% sapply("[", 2) %>% strsplit(",") %>% sapply(as.numeric)
output <- cbind(GENE = gene, "AF_1000G"=I(af), output) #Places it at the front of output
front_cols <- 1:(grep("HG00096",colnames(output))-1)
if (length(paired)!=0) {
#Limit max vector length by sapply(strsplit(output$ALT,","),length)
sapply(paired, function(rownum) { #For every row
sapply(as.character(1:alt_num[rownum]), function(num) {
grepl(paste(num,"|",sep = ""), output[rownum,-front_cols], fixed=T) +
grepl(paste("|",num,sep = ""), output[rownum,-front_cols], fixed=T)
}) %>% t -> temp
split(temp, rep(1:ncol(temp), each = nrow(temp))) %>% setNames(NULL)
#Separate into list of vectors (1 entry for counting each ALT)
}) %>% t -> insert
insert <- cbind(output[paired,front_cols],insert)
colnames(insert) <- colnames(output)
insert <- insert %>% #adds front_col info
mutate(ALT = strsplit(ALT,",")) %>% #Splits ALTS
unnest() %>% #Unnests everything
select(GENE, AF_1000G, CHROM, POS, ID, REF, ALT, everything()) #Reorders everything
output <- output[-paired,] #Removes paired
}
output <- cbind(output[,front_cols],
apply(output[,-front_cols], 2, function(y) {
grepl("1|", y, fixed=T) +
grepl("|1", y, fixed=T)
}) ) #convert to logical
if (length(paired)!=0)
output <- rbind(output, insert) #joins the two
output$AF_1000G <- as.numeric(output$AF_1000G)
unite(output, VAR_ID, CHROM, POS, REF, ALT, sep = "_", remove = F) %>% arrange(VAR_ID)
#Make VAR_ID, arrange by VAR_ID
}
if (skip_processing) {
load(file = "ACMG_1000G")
} else {
# Import 1000G data for all ACMG
ACMG.1000g <- NULL
header <- c("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", as.character(map$sample))
for (gene in ACMG.panel) {
#print(sprintf("[%d/%d] %s",which(gene==ACMG.panel),length(ACMG.panel),gene))
ACMG.1000g <- rbind(ACMG.1000g,import.file.1000g(gene))
}
#Display and remove duplicates
ACMG.1000g <- ACMG.1000g[!duplicated(ACMG.1000g$VAR_ID),]
}
cat(sprintf("Processed 1000 Genomes VCFs: %s x %s (selected rows/columns):", nrow(ACMG.1000g), ncol(ACMG.1000g)))
## Processed 1000 Genomes VCFs: 139335 x 2516 (selected rows/columns):
ACMG.1000g[1:5,1:18] %>% select(-INFO, -QUAL, -FILTER, -FORMAT) %>%
format(scientific = F) %>% pander
| GENE | AF_1000G | VAR_ID | CHROM | POS | ID | REF | ALT |
|---|---|---|---|---|---|---|---|
| APC | 0.000199681 | 5_112043211_A_G | 5 | 112043211 | rs554351451 | A | G |
| APC | 0.000199681 | 5_112043231_G_A | 5 | 112043231 | rs575784409 | G | A |
| APC | 0.005391370 | 5_112043234_C_T | 5 | 112043234 | rs115658307 | C | T |
| APC | 0.000199681 | 5_112043252_G_A | 5 | 112043252 | rs558562104 | G | A |
| APC | 0.008785940 | 5_112043263_C_T | 5 | 112043263 | rs138386816 | C | T |
Table continues below
| HG00096 | HG00097 | HG00099 | HG00100 | HG00101 | HG00102 |
|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 |
| 0 | 0 | 0 | 0 | 0 | 0 |
| 0 | 0 | 0 | 0 | 0 | 0 |
| 0 | 0 | 0 | 0 | 0 | 0 |
| 0 | 0 | 0 | 0 | 0 | 0 |
if (rm_rs1805124)
ACMG.1000g <- ACMG.1000g[ACMG.1000g$ID!="rs1805124",]
import.file.exac <- function(gene) {
file_name <- paste("ExAC",paste(gene,"exac.csv", sep = "_"), sep= "/")
output <- read.csv(file_name, stringsAsFactors = FALSE)
output$Number.of.Hemizygotes <- NULL #Inconsistently present column; removal allows row aggregation
output <- cbind(GENE = gene, output[nchar(paste(output$Alternate,output$Reference))==3,]) %>%
select(GENE, AF_EXAC = Allele.Frequency, CHROM=Chrom, POS=Position,
ID=RSID, REF=Reference, ALT=Alternate, everything()) %>%
unite(VAR_ID, CHROM, POS, REF, ALT, sep = "_", remove = F) %>% arrange(VAR_ID)
tags <- list("African","Latino","East.Asian","European","South.Asian")
output$Allele.Count.European <- output$Allele.Count.European..Non.Finnish. + output$Allele.Count.Finnish
output$Allele.Number.European <- output$Allele.Number.European..Non.Finnish. + output$Allele.Number.Finnish
exac_af <- output[,sprintf("Allele.Count.%s", tags)] / output[,sprintf("Allele.Number.%s", tags)]
colnames(exac_af) <- sprintf("AF_EXAC_%s", c("AFR","AMR","EAS","EUR","SAS"))
output <- cbind(output, exac_af) %>%
select(GENE, AF_EXAC, AF_EXAC_AFR, AF_EXAC_AMR, AF_EXAC_EAS, AF_EXAC_EUR, AF_EXAC_SAS, everything())
}
# Import ExAC data for all ACMG
ACMG.exac <- NULL
for (gene in ACMG.panel) {
#print(sprintf("[%d/%d] %s",which(gene==ACMG.panel),length(ACMG.panel),gene))
ACMG.exac <- rbind(ACMG.exac,import.file.exac(gene))
}
#Display and remove duplicates
#ACMG.exac[duplicated(ACMG.exac$VAR_ID),1:8]
ACMG.exac <- ACMG.exac[!duplicated(ACMG.exac$VAR_ID),]
cat(sprintf("Processed ExAC VCFs: %s x %s (selected rows/columns):", nrow(ACMG.exac), ncol(ACMG.exac)))
## Processed ExAC VCFs: 58873 x 45 (selected rows/columns):
ACMG.exac[1:5,1:13] %>% format(scientific = F) %>% pander
| GENE | AF_EXAC | AF_EXAC_AFR | AF_EXAC_AMR | AF_EXAC_EAS | AF_EXAC_EUR |
|---|---|---|---|---|---|
| APC | 0.00008130 | 0.00000000 | 0.0000000 | 0 | 0.0000000 |
| APC | 0.00008131 | 0.00000000 | 0.0000000 | 0 | 0.0000000 |
| APC | 0.11120000 | 0.07978723 | 0.1021505 | 0 | 0.1063298 |
| APC | 0.00008131 | 0.00000000 | 0.0000000 | 0 | 0.0000000 |
| APC | 0.00008134 | 0.00000000 | 0.0000000 | 0 | 0.0000000 |
Table continues below
| AF_EXAC_SAS | VAR_ID | CHROM | POS | ID | REF | ALT |
|---|---|---|---|---|---|---|
| 0.0001313370 | 5_112043365_G_C | 5 | 112043365 | . | G | C |
| 0.0001313025 | 5_112043382_A_G | 5 | 112043382 | . | A | G |
| 0.1184659837 | 5_112043384_T_G | 5 | 112043384 | rs78429131 | T | G |
| 0.0001313025 | 5_112043392_C_T | 5 | 112043392 | . | C | T |
| 0.0001313025 | 5_112043412_C_G | 5 | 112043412 | . | C | G |
if (rm_rs1805124)
ACMG.exac <- ACMG.exac[ACMG.exac$ID!="rs1805124",]
merge_clinvar_1000g <- function() {
inter <- intersect(clinvar$VAR_ID[clinvar$INTERP], ACMG.1000g$VAR_ID)
clinvar_merged <- clinvar[(clinvar$VAR_ID %in% inter),] %>% arrange(VAR_ID)
ACMG_merged <- ACMG.1000g[ACMG.1000g$VAR_ID %in% inter,] %>% arrange(VAR_ID)
front_cols <- 1:(grep("HG00096",colnames(ACMG.1000g))-1)
cbind(ACMG_merged[,c("GENE","AF_1000G")], clinvar_merged,
ACMG_merged[,-front_cols])
}
merged_1000g <- merge_clinvar_1000g()
inter <- intersect(clinvar$VAR_ID[clinvar$INTERP], ACMG.exac$VAR_ID)
merged_exac <- cbind(clinvar[(clinvar$VAR_ID %in% inter),] %>% arrange(VAR_ID),
ACMG.exac %>% select(VAR_ID, contains("AF_"), GENE) %>%
filter(VAR_ID %in% inter) %>% arrange(VAR_ID) %>% select(-VAR_ID)
) %>% select(VAR_ID, GENE, AF_EXAC, contains("AF_"), everything())
#count up all pathogenic ClinVar in ACMG regions
is.acmg <- function(row) {
genes <- which(row$CHROM == download_output$chrom)
sapply(genes, function(gene) {
between(row$POS, download_output[gene,]$start, download_output[gene,]$end)
}) %>% any
}
cat("Breakdown of ClinVar Variants")
## Breakdown of ClinVar Variants
data.frame(Subset_ClinVar = c("Total ClinVar","LP/P-ClinVar","LP/P-ClinVar & ACMG",
"LP/P-ClinVar & ACMG & ExAC","LP/P-ClinVar & ACMG & 1000 Genomes"),
Number_of_Variants = c(nrow(clinvar),
sum(clinvar$INTERP),
sum(apply(clinvar[clinvar$INTERP,], 1, is.acmg)),
nrow(merged_exac),
nrow(merged_1000g))) %>% pander
| Subset_ClinVar | Number_of_Variants |
|---|---|
| Total ClinVar | 126349 |
| LP/P-ClinVar | 33033 |
| LP/P-ClinVar & ACMG | 6252 |
| LP/P-ClinVar & ACMG & ExAC | 826 |
| LP/P-ClinVar & ACMG & 1000 Genomes | 122 |
cat("Breakdown of ACMG-1000 Genomes Variants")
## Breakdown of ACMG-1000 Genomes Variants
data.frame(Subset_1000_Genomes = c("Total 1000_Genomes & ACMG", "1000_Genomes & ACMG & ClinVar",
"1000_Genomes & ACMG & LP/P-ClinVar"),
Number_of_Variants = c(nrow(ACMG.1000g),
intersect(clinvar$VAR_ID, ACMG.1000g$VAR_ID) %>% length,
nrow(merged_1000g))) %>% pander
| Subset_1000_Genomes | Number_of_Variants |
|---|---|
| Total 1000_Genomes & ACMG | 139335 |
| 1000_Genomes & ACMG & ClinVar | 4891 |
| 1000_Genomes & ACMG & LP/P-ClinVar | 122 |
cat("Breakdown of ACMG-ExAC Variants")
## Breakdown of ACMG-ExAC Variants
data.frame(Subset_ExAC = c("Total ExAC & ACMG","ExAC & ACMG & ClinVar","ExAC & ACMG & LP/P-ClinVar"),
Number_of_Variants = c(nrow(ACMG.exac),
intersect(clinvar$VAR_ID, ACMG.exac$VAR_ID) %>% length,
nrow(merged_exac))) %>% pander
| Subset_ExAC | Number_of_Variants |
|---|---|
| Total ExAC & ACMG | 58873 |
| ExAC & ACMG & ClinVar | 10043 |
| ExAC & ACMG & LP/P-ClinVar | 826 |
clinvar_query.txt contains all results matched by the search query: “(APC[GENE] OR MYH11[GENE]… OR WT1[GENE]) AND (clinsig_pathogenic[prop] OR clinsig_likely_pathogenic[prop])” from the ClinVar website. The exact query is saved in /Supplementary_Files/query_input.txt
This presents another way of collecting data from ClinVar.
Intermediate step: convert hg38 locations to hg19 using the Batch Coordinate Conversion tool (liftOver) from UCSC Genome Browser Utilities.
clinvar_query <- read.table(file = "Supplementary_Files/clinvar_query_2016-10-25.txt", sep = "\t", header = F, skip = 1, stringsAsFactors = F)
colnames(clinvar_query) <- c("Name", "Gene(s)", "Condition(s)", "Frequency", "Clinical significance (Last reviewed)","Review status", "Chromosome","Location","Assembly","VariationID","AlleleID(s)")
#cat(sprintf("Initial count: %s variants", nrow(clinvar_query)))
clinvar_count <- rep(0,7)
clinvar_count[1] <- nrow(clinvar_query)
clinvar_query <- clinvar_query[grepl(".>.",clinvar_query$Name),]
#cat(sprintf("After filtering for substitutions (N>N'): %s variants", nrow(clinvar_query)))
clinvar_count[2] <- nrow(clinvar_query)
clinvar_query <- clinvar_query[!grepl(" - ", clinvar_query$Location) &
!grepl("|",clinvar_query$Chromosome, fixed = T) &
!(clinvar_query$Location == ""),]
#cat(sprintf("After filtering for coupling/bad-locations: %s variants (FINAL)", nrow(clinvar_query)))
clinvar_count[3] <- nrow(clinvar_query)
clinvar_query <- clinvar_query %>%
mutate(Name = sub(".*(.>.).*","\\1", clinvar_query$Name)) %>%
mutate(Location = as.integer(Location)) %>%
separate(Name, into = c("Alternate","Reference"), sep = ">")
#liftOver from http://hgdownload.soe.ucsc.edu/admin/exe/macOSX.x86_64/liftOverMerge
#input data from http://hgdownload.soe.ucsc.edu/goldenPath/hg38/liftOver/
#paste(paste0("chr",clinvar_query$Chromosome), clinvar_query$Location, clinvar_query$Location+1) %>%
# write.table(file = "Supplementary_Files/cvquery_location.bed", quote = F, row.names = F, col.names = F)
#system("chmod +x ../Tools/liftOver")
#system("../Tools/liftOver Supplementary_Files/cvquery_hg38.bed ../Tools/hg38ToHg19.over.chain.gz Supplementary_Files/cvquery_hg19.bed err.log")
cvquery_hg19 <- read.table(file = "Supplementary_Files/cvquery_hg19.bed", sep = "\t", header = F, stringsAsFactors = F)
clinvar_query <- clinvar_query %>%
mutate(Location = cvquery_hg19$V2) %>%
unite(col = "VAR_ID", Chromosome, Location, Reference, Alternate, sep = "_", remove = F)
#cat(sprintf("Overlap with ClinVar VCF: %s variants", sum(clinvar_query$VAR_ID %in% clinvar$VAR_ID)))
clinvar_count[4] <- sum(clinvar_query$VAR_ID %in% clinvar$VAR_ID)
#cat(sprintf("Overlap with LP/P ClinVar VCF: %s variants", sum(clinvar_query$VAR_ID %in% clinvar$VAR_ID[clinvar$INTERP])))
clinvar_count[5] <- sum(clinvar_query$VAR_ID %in% clinvar$VAR_ID[clinvar$INTERP])
#cat(sprintf("Overlap with LP/P ClinVar VCF AND ExAC: %s variants", sum(clinvar_query$VAR_ID %in% merged_exac$VAR_ID)))
clinvar_count[6] <- sum(clinvar_query$VAR_ID %in% merged_exac$VAR_ID)
#merged_exac[merged_exac$VAR_ID %in% clinvar_query$VAR_ID, c(1:3,11,15)][1:3,]
#cat(sprintf("Overlap with LP/P ClinVar VCF AND 1000 Genomes: %s variants", sum(clinvar_query$VAR_ID %in% merged_1000g$VAR_ID)))
clinvar_count[7] <- sum(clinvar_query$VAR_ID %in% merged_1000g$VAR_ID)
#merged_1000g[merged_1000g$VAR_ID %in% clinvar_query$VAR_ID, c(3,1:2,6,10)][1:3,]
clinvar_count[8] <- sum((clinvar_query$VAR_ID %in% merged_exac$VAR_ID) & (clinvar_query$VAR_ID %in% merged_1000g$VAR_ID))
clinvar_query[(clinvar_query$VAR_ID %in% merged_exac$VAR_ID) & (clinvar_query$VAR_ID %in% merged_1000g$VAR_ID),] -> temp
cat(sprintf("ClinVar Query Results Table (substitutions only): %s x %s (selected rows/columns)",
nrow(clinvar_query), ncol(clinvar_query)))
## ClinVar Query Results Table (substitutions only): 6714 x 13 (selected rows/columns)
filter(clinvar_query,
(clinvar_query$VAR_ID %in% merged_exac$VAR_ID) &
(clinvar_query$VAR_ID %in% merged_1000g$VAR_ID)) %>% select(c(1,4,5,6)) %>%
mutate(`Condition(s)` = `Condition(s)` %>% strsplit("|", fixed = T) %>% sapply("[",1)) %>%
mutate(Frequency = Frequency %>% strsplit(", ") %>% sapply("[",1) ) %>%
mutate(`Gene(s)` = `Gene(s)` %>% strsplit("|", fixed = T) %>% sapply("[",1) ) %>% pander
| VAR_ID | Gene(s) | Condition(s) | Frequency |
|---|---|---|---|
| X_100652891_C_G | GLA | Fabry disease | GMAF:0.00050(G) |
| 11_47374186_C_G | MYBPC3 | Primary familial hypertrophic cardiomyopathy | GMAF:0.00020(G) |
| 11_47355233_C_G | MYBPC3 | Familial hypertrophic cardiomyopathy 4 | GMAF:0.00020(G) |
| 11_47364162_C_G | MYBPC3 | Familial hypertrophic cardiomyopathy 4 | GMAF:0.00020(G) |
| 14_23886482_G_C | MYH7 | not specified | GMAF:0.00020(C) |
| 14_23893148_C_G | MYH7 | Primary dilated cardiomyopathy | GO-ESP:0.00046(G) |
| 1_17355075_A_T | SDHB | Gastrointestinal stromal tumor | GMAF:0.00120(T) |
| 1_17380507_G_C | SDHB | Cowden syndrome 2 | GO-ESP:0.01323(C) |
cat("Breakdown of ClinVar Query Results Table: ")
## Breakdown of ClinVar Query Results Table:
data.frame(Subset = c("Initial Count","Filter Substitutions (N>N')","Filter Coupling/Bad-Locations","In ClinVar VCF","In LP/P-ClinVar VCF","^ & ACMG & ExAC","^ & ACMG & 1000 Genomes", "^ & ACMG & ExAC & 1000 Genomes"), Number_of_Variants = clinvar_count) %>% pander
| Subset | Number_of_Variants |
|---|---|
| Initial Count | 12525 |
| Filter Substitutions (N>N’) | 6732 |
| Filter Coupling/Bad-Locations | 6714 |
| In ClinVar VCF | 509 |
| In LP/P-ClinVar VCF | 503 |
| ^ & ACMG & ExAC | 49 |
| ^ & ACMG & 1000 Genomes | 9 |
| ^ & ACMG & ExAC & 1000 Genomes | 8 |
cat("Note the 12-fold reduction after merging the online query results with the VCF.")
## Note the 12-fold reduction after merging the online query results with the VCF.
### For plotting population level data:
prettyprint <- function(values, sd, title, xlabel, ylabel, ylimit) {
if (missing(sd)) sd <- TRUE
if (missing(title)) title <- NULL
if (missing(xlabel)) xlabel <- "Population"
if (missing(ylabel)) ylabel <- NULL
if (missing(ylimit)) ylimit <- NULL
colnames(values) <- c("Mean","SD")
values$Population <- factor(pop.levels, levels = pop.levels)
values$Superpopulation <- factor(super[pop.levels], levels = super.levels)
plot.pop <- ggplot(values, aes(x=Population, y=Mean, fill = Superpopulation)) +
geom_bar(stat = "identity") + ggtitle(title) + xlab(xlabel) + ylab(ylabel) +
theme_minimal() + theme(axis.text.x = element_text(angle = -45, hjust = 0.4))
if (sd) {
if (min(values$Mean - values$SD)<0)
plot.pop <- plot.pop + geom_errorbar(aes(
ymin = pmax(0,values$Mean - values$SD),
ymax = values$Mean + values$SD, width = 0.5))
else
plot.pop <- plot.pop + geom_errorbar(aes(ymin = values$Mean - values$SD,
ymax = values$Mean + values$SD, width = 0.5))
} else {values$SD = 0}
if (length(ylimit)==2)
plot.pop <- plot.pop + ylim(ylimit[1],ylimit[2])
else
plot.pop <- plot.pop + ylim(0, 1.1*max(values$Mean + values$SD))
plot.pop
}
plot_af_distrib <- function() {
af_distrib <- data.frame(Index = 2:max(nrow(merged_1000g),nrow(merged_exac))-1,
AF_1000G = sort(merged_1000g$AF_1000G[merged_1000g$AF_1000G != max(merged_1000g$AF_1000G)],
decreasing = T) %>% c(rep(NA,nrow(merged_exac)-nrow(merged_1000g))),
AF_EXAC = sort(merged_exac$AF_EXAC[merged_exac$AF_EXAC != max(merged_exac$AF_EXAC)],
decreasing = T)) %>%
gather(Dataset, Allele_Frequency, AF_1000G, AF_EXAC) %>%
filter(!is.na(Allele_Frequency))
ggplot(aes(x = Allele_Frequency, color = Dataset), data = af_distrib) +
geom_density() + facet_grid(Dataset ~ .) + xlab("Allele Frequency") + ylab("Density") +
theme(legend.position="none")
}
plot_af_distrib()
#Test of Poissonness
x = table(merged_1000g$AF_1000G)
k = 1:length(x)-1
poissondata = data.frame(k=k, poissonness = as.vector(log(x)+lfactorial(k)))
The distribution of allele frequencies is approximately Poisson, with “Poissonness plot” correlation = 0.99. The Poissonness plot (Hoaglin 1980) is defined as the plot of \(log(x_k) + log(k!)\) vs. \(k\), as shown below:
ggplot(aes(x=k,y=poissonness), data = poissondata) + geom_point() + ylab("log(x_k) + log(k!)") + ggtitle("Poissonness Plot")
Each individual has \(n\) non-reference sites, which can be found by counting. The mean number is computed for each population.
Ex: the genotype of 3 variants in 3 people looks like this:
example <- ACMG.1000g[c(88,95,96),14:16]
rownames(example) <- c("Variant 1", "Variant 2","Variant 3")
example %>% pander
| HG00097 | HG00099 | HG00100 | |
|---|---|---|---|
| Variant 1 | 0 | 2 | 1 |
| Variant 2 | 0 | 0 | 1 |
| Variant 3 | 0 | 0 | 1 |
Count the number of non-reference sites per individual:
colSums(example>0) %>% pander
| HG00097 | HG00099 | HG00100 |
|---|---|---|
| 0 | 1 | 3 |
cat(sprintf("Mean = %s", mean(colSums(example>0)) %>% signif(3)))
## Mean = 1.33
### 1000 Genomes
front_cols <- 1:(grep("HG00096",colnames(ACMG.1000g))-1)
sapply(pop.levels, function(pop) {
#Counts the number of non-reference sites in a gene
temp <- colSums(ACMG.1000g[,c(front_cols,map$pop)==pop]>0)
c(mean(temp), sd(temp))
}) %>% t %>% tbl_df -> values #Number of non-reference sites across the different populations
colnames(values) <- c("Mean","SD")
values$Population <- factor(pop.levels, levels = pop.levels)
values$Superpopulation <- factor(super[pop.levels], levels = super.levels)
prettyprint(values, title = "ACMG-56: Mean in 1000 Genomes", sd = T, ylimit = NULL,
xlabel = "Population", ylabel = "Mean No. of Non-Reference Sites")
Note: the error bars denote standard deviation, not standard error.
The mean number of non-reference sites is \(E(V)\), where \(V = \sum_{i=1}^n v_i\) is the number of non-reference sites at all variant positions \(v_1\) through \(v_n\).
At each variant site, the probability of having at least 1 non-reference allele is \(P(v_i) = P(v_{i,a} \cup v_{i,b})\), where \(a\) and \(b\) indicate the 1st and 2nd allele at each site.
If the two alleles are independent, \(P(v_{i,a} \cup v_{i,b})\) = \(1-(1-P(v_{i,a}))(1-P(v_{i,b})) = 1-(1-AF(v_i))^2\)
If all variants are independent, \(E(V) = \sum_{i=1}^n 1-(1-AF(v_i))^2\) for any set of allele frequencies.
Ex: the allele frequencies of 3 variants across the 5 superpopulations looks like this:
example <- rbind(c(0.1,0.2,0,0,0.3),c(0.2,0,0.3,0,0.1)) %>% as.data.frame
rownames(example) <- c("Variant 1", "Variant 2")
colnames(example) <- super.levels
example %>% pander
| AFR | AMR | EAS | EUR | SAS | |
|---|---|---|---|---|---|
| Variant 1 | 0.1 | 0.2 | 0 | 0 | 0.3 |
| Variant 2 | 0.2 | 0 | 0.3 | 0 | 0.1 |
The probability of having at least 1 non-reference site at each variant - (0|1) (1|0) or (1|1) is given by \(1-(1-AF)^2\). Note that this is approximately \(2*AF\) when \(AF\) is small:
as.data.frame(1-(1-example)^2) %>% pander
| AFR | AMR | EAS | EUR | SAS | |
|---|---|---|---|---|---|
| Variant 1 | 0.19 | 0.36 | 0 | 0 | 0.51 |
| Variant 2 | 0.36 | 0 | 0.51 | 0 | 0.19 |
By linearity of expectation, the expected (mean) number of non-reference sites is \(\sum E(V_i) = \sum (columns)\).
colSums(1-(1-example)^2) %>% pander
| AFR | AMR | EAS | EUR | SAS |
|---|---|---|---|---|
| 0.55 | 0.36 | 0.51 | 0 | 0.7 |
### ExAC
#Each element is the probability that at least 1 of the 2 alleles are non-reference.
exac_prob <- 1-(1-ACMG.exac[,sprintf("AF_EXAC_%s",super.levels)])^2
exac_values <- data.frame(exac_prob %>% colSums(na.rm = T), super.levels)
colnames(exac_values) = c("values","Superpopulation")
ggplot(exac_values, aes(x = Superpopulation, y=values, fill = Superpopulation)) +
geom_bar(stat = "identity") + theme_minimal() + ggtitle("ACMG-56: Mean in ExAC") +
xlab("Population") + ylab("Mean No. of Non-Reference Sites") +
ylim(0,1.1*max(exac_values$values))
This is the same procedure as above, but performed only on the subset of variants that are pathogenic.
front_cols <- 1:(grep("HG00096",colnames(merged_1000g))-1)
### 1000 Genomes
sapply(pop.levels, function(pop) {
#Counts the number of non-reference sites in a gene
temp <- colSums(merged_1000g[,c(front_cols,map$pop)==pop]>0)
c(mean(temp), sd(temp))
}) %>% t %>% tbl_df -> values #Number of non-reference sites across the different populations
colnames(values) <- c("Mean","SD")
values$Population <- factor(pop.levels, levels = pop.levels)
values$Superpopulation <- factor(super[pop.levels], levels = super.levels)
prettyprint(values, title = "ACMG-56 Pathogenic: Mean in 1000 Genomes", sd = T, ylimit = NULL,
xlabel = "Population", ylabel = "Mean No. of Non-Reference Sites")
### ExAC
#Each element is the probability that at least 1 of the 2 alleles are non-reference.
exac_prob <- (1-(1-merged_exac[,sprintf("AF_EXAC_%s",super.levels)])^2)
exac_values <- data.frame(exac_prob %>% colSums(na.rm = T), super.levels)
colnames(exac_values) = c("values","Superpopulation")
ggplot(exac_values, aes(x = Superpopulation, y=values, fill = Superpopulation)) +
geom_bar(stat = "identity") + ggtitle("ACMG-56 Pathogenic: Mean in ExAC") +
xlab("Population") + ylab("Mean No. of Non-Reference Sites") + theme_minimal() +
ylim(0,1.1*max(exac_values$values))
We can count up the fraction of individuals with 1+ non-reference site(s) in each population. This is the fraction of individuals who would receive a positive genetic test result in at least 1 of the ACMG-56 genes.
Ex: the genotype of 3 variants in 3 people looks like this:
example <- ACMG.1000g[c(88,95,96),14:16]
rownames(example) <- c("Variant 1", "Variant 2","Variant 3")
example %>% pander
| HG00097 | HG00099 | HG00100 | |
|---|---|---|---|
| Variant 1 | 0 | 2 | 1 |
| Variant 2 | 0 | 0 | 1 |
| Variant 3 | 0 | 0 | 1 |
Count each individual as having a non-reference site (1) or having only reference sites (0):
(1*(colSums(example>0)>0)) %>% pander
| HG00097 | HG00099 | HG00100 |
|---|---|---|
| 0 | 1 | 1 |
cat(sprintf("Mean = %s", mean(1*(colSums(example>0)>0)) %>% signif(3)))
## Mean = 0.667
front_cols <- 1:(grep("HG00096",colnames(merged_1000g))-1)
### 1000 Genomes
sapply(pop.levels, function(pop) {
#Counts the number of non-reference sites in a gene
keep = length(front_cols)+which(map$pop == pop)
temp <- colSums(merged_1000g[,keep])>0
c(mean(temp),sd(temp))
}) %>% t %>% tbl_df -> values #Number of non-reference sites across the different populations
colnames(values) <- c("Mean","SD")
values$Population <- factor(pop.levels, levels = pop.levels)
values$Superpopulation <- factor(super[pop.levels], levels = super.levels)
prettyprint(values, title = "ACMG-56 Pathogenic: 1000 Genomes Fraction", sd = F, ylimit = NULL,
xlabel = "Population", ylabel = "Fraction with at least 1 non-reference site")
The probability of having at least 1 non-reference site is \(P(X)\), where \(X\) indicates a non-reference site at any variant position \(v_1\) through \(v_n\).
Recall that \(P(v_i) = P(v_{i,a} \cup v_{i,b}) = 1-(1-AF(v))^2\) when alleles are independent.
If all alleles are independent, \(P(X) = P(\bigcup_{i=1}^n v_i) = 1-\prod_{i=1}^n (1-AF(v_i))^2\)
Ex: the allele frequencies of 3 variants across the 5 superpopulations looks like this:
example <- rbind(c(0.1,0.2,0,0,0.3),c(0.2,0,0.3,0,0.1)) %>% as.data.frame
rownames(example) <- c("Variant 1", "Variant 2")
colnames(example) <- super.levels
example %>% pander
| AFR | AMR | EAS | EUR | SAS | |
|---|---|---|---|---|---|
| Variant 1 | 0.1 | 0.2 | 0 | 0 | 0.3 |
| Variant 2 | 0.2 | 0 | 0.3 | 0 | 0.1 |
The probability of having at least 1 non-reference site at each variant - (0|1) (1|0) or (1|1) is given by \(1-(1-AF)^2\).
Note that this is approximately \(2*AF\) when \(AF\) is small:
as.data.frame(1-(1-example)^2) %>% pander
| AFR | AMR | EAS | EUR | SAS | |
|---|---|---|---|---|---|
| Variant 1 | 0.19 | 0.36 | 0 | 0 | 0.51 |
| Variant 2 | 0.36 | 0 | 0.51 | 0 | 0.19 |
The expected (mean) number of non-reference sites is given by \(1-\prod (1-AF)^2\).
apply(example, 2, function(x) 1-prod((1-x)^2)) %>% pander
| AFR | AMR | EAS | EUR | SAS |
|---|---|---|---|---|
| 0.4816 | 0.36 | 0.51 | 0 | 0.6031 |
### ExAC
#Each element is the probability that at least 1 of the 2 alleles are non-reference.
exac_prob <- (1-(1-merged_exac[,sprintf("AF_EXAC_%s",super.levels)])^2)
exac_values <- data.frame(1-apply(1-exac_prob, 2, prod, na.rm = T), super.levels)
colnames(exac_values) = c("values","Superpopulation")
ggplot(exac_values, aes(x = Superpopulation, y=values, fill = Superpopulation)) +
geom_bar(stat = "identity") + ggtitle("ACMG-56 Pathogenic: ExAC Fraction") +
xlab("Population") + ylab("Fraction with at least 1 non-reference site") + theme_minimal() +
ylim(0,1.05*max(exac_values$values))
F-statistic/T-statistic: probability that the different groups are sampled from distributions with the same mean.
These plots are from 4(a) - 1000 Genomes Fraction with 1+ Non-Reference Site, but can be replicated for plots 2(ab) and 3(ab) as well.
#Calculating test statistics (F-values)
Fcalc <- function(values, pop) {
if (missing(pop)) {
groups <- super[pop.levels]
} else {
groups <- ifelse(super[pop.levels]==pop,pop,"Other")
}
data <- data.frame(y = values, group = factor(groups))
color_map <- c("red","gold3","springgreen3","deepskyblue","violet","white") %>%
setNames(c("AFR","AMR","EAS","EUR","SAS","Other"))
out <- lm(y ~ group, data) %>% anova
plot(y ~ group, data, xlab = NULL, ylab = NULL,
col = color_map[sort(unique(groups))],
main = paste("F-statistic:",out$`Pr(>F)`[1] %>% signif(3)))
out
}
par(mfrow=c(2,3))
F_values <- c(Fcalc(values$Mean)$`Pr(>F)`[1] %>% setNames("Overall"),
sapply(super.levels, function(pop) {
Fcalc(values$Mean, pop)$`Pr(>F)`[1]
}))
par(mfrow=c(1,1), mar=c(5, 4, 4, 2)+0.1)
#F_values
### 1000 Genomes
front_cols <- 1:(grep("HG00096",colnames(merged_1000g))-1)
sapply(super.levels, function(superpop) {
(merged_1000g[,length(front_cols)+which(map$super_pop == superpop)] %>%
rowSums %>% setNames(merged_1000g$VAR_ID)) /
(2*sum(map$super_pop==superpop))
}) -> af_1000g_by_ancestry
ord <- order(apply(af_1000g_by_ancestry,1,sum), decreasing = T)[1:8]
ranked_id <- row.names(af_1000g_by_ancestry)[ord]
ranked_var <- data.frame(Var_ID = factor(ranked_id, levels = ranked_id),
af_1000g_by_ancestry[ord,]) %>% gather(Ancestry, Subdivided_Allele_Frequencies, -Var_ID)
ggplot(ranked_var, aes(x = Var_ID, y = Subdivided_Allele_Frequencies, fill = Ancestry)) +
geom_bar(stat='identity', color = 'black', width = 0.7) +
ggtitle("High Frequency Variants in 1000 Genomes") + coord_flip()
### ExAC
af_exac_by_ancestry <- merged_exac[,sprintf("AF_EXAC_%s",super.levels)]
colnames(af_exac_by_ancestry) <- super.levels
ord <- order(apply(af_exac_by_ancestry,1,sum), decreasing = T)[1:8]
ranked_id <- merged_exac$VAR_ID[ord]
ranked_var <- data.frame(Var_ID = factor(ranked_id, levels = ranked_id),
af_exac_by_ancestry[ord,]) %>%
gather(Ancestry, Subdivided_Allele_Frequencies, 2:6)
ggplot(ranked_var, aes(x = Var_ID, y = Subdivided_Allele_Frequencies, fill = Ancestry)) +
geom_bar(stat='identity', color = 'black', width = 0.7) +
ggtitle("High Frequency Variants in ExAC") + coord_flip()
Let \(V_x\) be the event that an individual has 1 or more variant related to disease \(x\),
and \(D_x\) be the event that the individual is later diagnosed with disease \(x\).
In this case, we can define the following probabilities:
1. Prevalence = \(P(D_x)\)
2. Allele Frequency = \(P(V_x)\)
3. Allelic Heterogeneity = \(P(V_x|D_x)\)
4. Penetrance = \(P(D_x|V_x)\)
By Bayes’ Rule, the penetrance of a variant related to disease \(x\) may be defined as: \[P(D_x|V_x) = \frac{P(D_x)*P(V_x|D_x)}{P(V_x)} = \frac{Prevalence*Allelic.Heteogeneity}{Allele.Frequency}\]
To compute penetrance estimates for each of the diseases related to the ACMG-56 genes, we will use the prevalence data we collected into Literature_Prevalence_Estimates.csv, allele frequency data from 1000 Genomes and ExAC, and a broad range of values for allelic heterogeneity.
Data Collection: 1. Similar disease subtypes were grouped together (e.g., the 8 different types of familial hypertrophic cardiomyopathy), resulting in 30 disease categories across 56 genes.
2. The search query “[disease name] prevalence” was used to find articles using Google Scholar.
3. Prevalence estimates were recorded along with URL, journal, region, publication year, sample size, first author, population subset (if applicable), date accessed, and potential issues. Preference was given to studies with PubMed IDs, more citations, and larger sample sizes.
Prevalence was recorded as reported: either a point estimate or a range. Values of varying quality were collected across all diseases.
colclass <- rep("character",16)
colclass[c(5:6,10,14)] <- "numeric"
ACMG_Lit <- read.csv(file = "Literature_Prevalence_Estimates.csv", header = TRUE, stringsAsFactors = F, na.strings = "\\N", colClasses = colclass)
disease_abbrev <- sapply(ACMG_Lit$Disease, function(d) {
strsplit(d," ") %>% sapply("[",ifelse(grepl("Familial",d),2,1)) %>% substr(1,9) %>% paste0("...")
}) %>% setNames(NULL)
disease_abbrev_longer <- sapply(ACMG_Lit$Disease, function(d) {
out <- substr(d,1,17)
split_out <- strsplit(out," ") %>% unlist
if (nchar(split_out[length(split_out)]) <= 2)
out <- split_out[-length(split_out)] %>% paste(collapse = " ")
out %>% paste0("...")
}) %>% setNames(NULL)
combined <- (ACMG_Lit$Inverse.Prevalence.1+ACMG_Lit$Inverse.Prevalence.2)/2
mean.prev <- ACMG_Lit$Inverse.Prevalence.1 %>% setNames(disease_abbrev_longer)
mean.prev[!is.na(combined)] <- combined[!is.na(combined)]
cat(sprintf("Table of Literature-Based Estimates of Disease Prevalence %s x %s (selected rows/columns):", nrow(ACMG_Lit), ncol(ACMG_Lit)))
## Table of Literature-Based Estimates of Disease Prevalence 30 x 16 (selected rows/columns):
ACMG_Lit[c(4,5,8,14),] %>% remove_rownames %>%
select(Gene, Disease, Disease_MIM, Tags, Inverse.Prevalence.1, Inverse.Prevalence.2, year,first.author,citations) %>% pander
| Gene | Disease | Disease_MIM | Tags |
|---|---|---|---|
| BRCA1;BRCA2 | Breast-ovarian cancer familial | 604370;612555 | breast;ovarian |
| SCN5A | Brugada syndrome | 601144 | brugada |
| COL3A1 | Ehlers-Danlos syndrome | 130050 | ehler;danlos |
| TP53 | Li-Fraumeni syndrome | 151623 | fraumeni |
Table continues below
| Inverse.Prevalence.1 | Inverse.Prevalence.2 | year | first.author | citations |
|---|---|---|---|---|
| 104 | NA | 2013 | NA | NA |
| 10000 | 2000 | 2006 | Antzelevitch | 11 |
| 20000 | NA | 2010 | Malfait | 116 |
| 20000 | 5000 | 1999 | Schneider | 47 |
data.frame(Disease = disease_abbrev_longer,
Inverse_Prevalence = mean.prev %>% setNames(NULL)) %>%
ggplot(aes(x = factor(Disease, levels = Disease[order(Inverse_Prevalence)]), y = Inverse_Prevalence)) +
geom_point(stat = 'identity') + coord_flip() + xlab("Disease") +
scale_y_continuous(trans='log10', breaks = 10^(0:6),
labels = sapply(0:6, function(x) paste0("1",paste0(rep("0",x), collapse = "")))) +
theme(axis.text.y=element_text(size=7)) +
ggtitle("Distribution of Inverse Prevalences (log-scale)") + ylab("Inverse Prevalence")
We define AF(disease) as the probability of having at least 1 variant associated with the disease.
The frequencies across the relevant variants can be aggregated in two ways:
(1) By direct counting, from genotype data in 1000 Genomes.
(2) AF(disease) = \(1-\prod_{variant}(1-AF_{variant})\), from population data in ExAC (assumes independence).
if (!("AF_1000G_AFR" %in% colnames(merged_1000g))) {
front_cols <- 1:(grep("HG00096",colnames(merged_1000g))-1)
sapply(super.levels, function(superpop){
(merged_1000g[,length(front_cols)+which(map$super_pop == superpop)] %>% rowSums)/(2*2504)
}) -> pop_af
colnames(pop_af) <- sprintf("AF_1000G_%s",super.levels)
merged_1000g <- data.frame(merged_1000g, pop_af) %>%
select(GENE, AF_1000G, VAR_ID, CHROM, POS, ID, REF, ALT, CLNALLE, CLNSIG,
AF_1000G_AFR, AF_1000G_AMR, AF_1000G_EAS, AF_1000G_EUR, AF_1000G_SAS, everything())
}
front_cols <- 1:(grep("HG00096",colnames(merged_1000g))-1)
getAlleleFreq <- function(input, ind, method, dataset) {
if (method == "MIM") {
search_list <- ACMG_Lit$Disease_MIM
search_in <- "CLNDSDBID"
}
if (method == "Tags") {
search_list <- ACMG_Lit$Tags
search_in <- "CLNDBN"
}
if (method == "Gene") {
search_list <- ACMG_Lit$Gene
search_in <- "GENE"
}
output <- sapply(search_list, function(item.vec) {
item.vec.split <- strsplit(item.vec,";") %>% unlist
loc <- rep(FALSE, nrow(input)) #loc = locations of all the "hits"
for(item in item.vec.split)
loc <- loc | grepl(item,input[,search_in], ignore.case = T)
hits <- sum(loc)
sapply(c(dataset,super.levels), function(superpop) {
if (ind) {
find = sprintf("AF_%s",toupper(dataset))
if (superpop!=dataset)
find = paste(find, superpop, sep = "_")
# Aggregation by calculation + ind assumption
freq <- input[loc,find] %>% unlist %>% as.numeric #vector of all allele frequencies
freq <- 1-(1-freq)^2 #probability of having a non-reference site at EITHER allele.
final <- 1-prod(1-freq[!is.na(freq)])
} else {
# Aggregation by counting
front_cols <- 1:(grep("HG00096",colnames(input))-1)
if (superpop == dataset)
catch <- -front_cols
else
catch <- length(front_cols)+which(map$super_pop==superpop)
final <- mean(colSums( input[loc, catch] )>0)
}
final
}) -> final_list
col_head <- paste("AF",toupper(dataset),sep = "_")
c(final_list, hits) %>% setNames(c(col_head, sprintf("%s_%s", col_head, super.levels), "Hits"))
}) %>% t %>% tbl_df
data.frame(search_list, output)
}
# Other methods MIM and Tags
# Do NOT use MIM if CLNDSDBID is missing (older VCFs)
freq_1000g.count.gene <- getAlleleFreq(merged_1000g, ind = F, method = "Gene", dataset = "1000G")
freq_1000g.calc.gene <- getAlleleFreq(merged_1000g, ind = T, method = "Gene", dataset = "1000G")
freq_exac.calc.gene <- getAlleleFreq(merged_exac, ind = T, method = "Gene", dataset = "EXAC")
allele.freq <- data.frame(
COUNT_1000G = freq_1000g.count.gene$AF_1000G,
CALC_1000G = freq_1000g.calc.gene$AF_1000G,
CALC_EXAC = freq_exac.calc.gene$AF_EXAC
)
row.names(allele.freq) <- ACMG_Lit$Disease
#cor(allele.freq) %>% as.data.frame %>% pander
par(mfrow=c(1,2), mar=c(5, 5.5, 4, 2)+0.1)
ggplot(aes(x = CALC_1000G, y = CALC_EXAC), data = allele.freq) +
geom_point(stat = "identity", col = 'red') + ggtitle("Scatterplot: ExAC v. 1000 Genomes") +
geom_text_repel(aes(label = disease_abbrev_longer), size = 3) +
scale_x_continuous(limits = c(10^-5, max(allele.freq[,"CALC_EXAC"]))*5,
trans='log10', breaks = 10^-(0:3)) +
scale_y_continuous(limits = c(10^-5, max(allele.freq[,"CALC_EXAC"]))*5,
trans='log10', breaks = 10^-(0:3)) +
xlab("Allele Frequency (1000 Genomes)") + ylab("Allele Frequency (ExAC)") +
geom_abline(slope = 1, intercept = 0)
Ratio_1000G (red, top) computes AF(calculation in 1000 Genomes) / AF(counting in 1000 Genomes).
Ratio_ExAC (blue, bottom) computes AF(calculation in ExAC) / AF(counting in 1000 Genomes).
ratio_diff <- function() {
ratio <- data.frame(Means = allele.freq$COUNT_1000G,
Ratio_1000G = (pmax(allele.freq$CALC_1000G, allele.freq$COUNT_1000G)/
pmin(allele.freq$CALC_1000G, allele.freq$COUNT_1000G)),
Ratio_ExAC = (pmax(allele.freq$CALC_EXAC, allele.freq$CALC_1000G)/
pmin(allele.freq$CALC_EXAC, allele.freq$CALC_1000G))) %>%
mutate(Disease = factor(disease_abbrev_longer,
levels = disease_abbrev_longer[order(pmax(Ratio_ExAC,Ratio_1000G))])) %>%
gather(Dataset, Ratio, Ratio_1000G, Ratio_ExAC) %>%
filter(is.finite(Ratio))
ggplot(aes(x=Disease, y=Ratio, color = Dataset), data = ratio) + coord_flip() +
geom_point(stat = 'identity') + facet_wrap("Dataset", ncol = 1) +
ggtitle("Ratios of Allele Frequencies from Different Methods") +
scale_y_continuous(breaks = seq(0,100,1)) + theme(legend.position="none")
}
ratio_diff()
Sampling 1000 variants from all variants in 1000 Genomes to test deviations from independence assumptions.
Repeat for 1000 trials and plot the distribution of disease-level allele frequencies (1000 points per disease).
plot_ind_test <- function() {
load("Supplementary_Files/ind_test.RData")
#set.seed(123)
#do.call("rbind",lapply(1:1000, function(x) {
# select_rows <- sample(nrow(ACMG.1000g),1000)
# data.frame(DISEASE = disease_abbrev_longer,
# COUNT = getAlleleFreq(ACMG.1000g[select_rows,],
# ind = F, method = "Gene", dataset = "1000G")$AF_1000G,
# CALC = getAlleleFreq(ACMG.1000g[select_rows,],
# ind = T, method = "Gene", dataset = "1000G")$AF_1000G)
#})) -> ind_test_data
#plot(ind_test_data %>% ggplot(aes(x=CALC-COUNT)) + geom_histogram(bins = 100) +
# xlab("Calculation - Counting") + ylab("Histogram Counts") + facet_wrap("DISEASE", ncol = 3))
plot(sapply(levels(ind_test_data$DISEASE), function(d) {
ind_test_data %>% filter(DISEASE == d) %>% mutate(DIFF = CALC - COUNT) %>%
select(DIFF) %>% unlist
}) %>% data.frame %>% gather(Disease, Data) %>%
ggplot(aes(x = Disease, y = Data)) + geom_boxplot() + coord_flip() +
ylab("Allele Frequency Difference: Calculation - Counting") +
ggtitle("Differences in AF Methods: by Disease"))
cat("30 diseases x 1000 points = 30,000 points. This plot has been downsampled 10x and contains 3,000 points.")
plot(ggplot(aes(x = COUNT, y = CALC), data = ind_test_data[runif(nrow(ind_test_data)) < 0.1,]) +
geom_point() +
scale_x_continuous(limits = c(0,1), breaks = seq(0,1,0.1)) +
scale_y_continuous(limits = c(0,1), breaks = seq(0,1,0.1)) +
xlab("Allele Frequency (From Counting)") + ylab("Allele Frequency (From Calculation)") +
ggtitle("Testing Independence with Random Sampling") +
geom_abline(slope = 1, intercept = 0, color = 'blue') +
geom_text(aes(label = replace(DISEASE,which(abs(COUNT - CALC) < 0.4), NA)),
check_overlap = F, hjust = 1, na.rm = T, size = 2.5)
)
ind_test_data %>% filter(COUNT>0)
}
plot_ind_out <- plot_ind_test()
## 30 diseases x 1000 points = 30,000 points. This plot has been downsampled 10x and contains 3,000 points.
Pearson correlation: 0.99
Mean ratio (Calculation/Counting): 1.07
The left end of the boxplot indicates P(V|D) = 0.001,
the bold line in the middle indicates P(V|D) = 0.25,
the right end of the boxplot indicates P(V|D) = 0.5.
get_penetrance <- function(ah_low, ah_high, dataset) {
# Map of disease name to disease tags
if (dataset == "1000 Genomes")
named.freqs <- allele.freq$COUNT_1000G %>% setNames(disease_abbrev_longer)
if (dataset == "ExAC")
named.freqs <- allele.freq$CALC_EXAC %>% setNames(disease_abbrev_longer)
named.prev <- 1/mean.prev %>% setNames(disease_abbrev_longer)
# Repeats allow for correct quartile calculations
#point estimate set to arithmetic mean
allelic.het <- c(ah_low, ah_low, mean(c(ah_low, ah_high)) %>% signif(3), ah_high, ah_high)
#Functions to transform data points with disease_af = 0
set_to_na <- function(row) { replace(row, is.infinite(row),NA) %>% pmin(1)}
# Matrix of penetrance values for allelic het range, capped at 1
penetrance <- as.vector(allelic.het %o% (named.prev/named.freqs)) %>% set_to_na()
order(matrix(penetrance,nrow = 5) %>% colSums, decreasing = T) -> ord
# replicate each element n times to create labels
penetrance_data <- data.frame("Penetrance" = penetrance, "Disease" =
factor(sapply(disease_abbrev_longer, function(x) rep(x,length(allelic.het))) %>% as.vector,
levels = disease_abbrev_longer[ord]) )
colormap <- rep('black', length(disease_abbrev))
colormap[1:(mean(is.na(penetrance_data$Penetrance))*length(colormap))] <- 'gray60'
colormap <- rev(colormap)
plot(
ggplot(aes(x=Disease, y=Penetrance), data = penetrance_data) +
geom_boxplot(position = 'identity', coef = 0, na.rm = T) + coord_flip() +
ggtitle(dataset) + theme(axis.text.y = element_text(color = colormap))
#annotate("text", x = which(colormap == 'red'), y = 0,
# label = "No Allele Frequency Data", hjust = 0, size = 3)
)
penetrance_data
}
pen_1000g <- get_penetrance(ah_low = 0.001, ah_high = 0.5, dataset = "1000 Genomes")
pen_exac <- get_penetrance(ah_low = 0.001, ah_high = 0.5, dataset = "ExAC")
Note 1: the grayed-out empty lines at the top all indicate no allele frequency (disease_AF) data.
Note 2: For breast-ovarian cancer, mean theoretical penetrance > 1. This is because the assumed allelic heterogeneity (0.25) is greater than is possible, given the empirical prevalence and allele frequencies.
doubles <- ACMG_Lit[!is.na(combined),c("Disease","Inverse.Prevalence.1","Inverse.Prevalence.2")]
doubles$Mean.Inverse.Prevalence <- apply(doubles[,-1], 1, function(row) 1/mean(1/row))
data.frame(Disease = doubles$Disease, Prevalence_Ratio = round(doubles$Inverse.Prevalence.1/doubles$Inverse.Prevalence.2,1)) %>% arrange(Prevalence_Ratio) %>% mutate(Prevalence_Ratio = sprintf("%.1f",Prevalence_Ratio)) %>% pander
| Disease | Prevalence_Ratio |
|---|---|
| Retinoblastoma | 1.3 |
| Marfan’s syndrome | 1.5 |
| Lynch syndrome | 3.0 |
| Adenomatous polyposis coli | 3.3 |
| Li-Fraumeni syndrome | 4.0 |
| Paragangliomas | 4.0 |
| Pilomatrixoma | 4.0 |
| Brugada syndrome | 5.0 |
| Peutz-Jeghers syndrome | 12.0 |
| Left ventricular noncompaction | 18.6 |
The left end of the boxplot indicates P(D) = upper value,
the bold line in the middle indicates P(D) = mean(values),
the right end of the boxplot indicates P(D) = lower value.
prev.range <- 1/doubles[,c(2,2,4,3,3)] %>% as.matrix
#Set this to 1 rather than 0.02
penetrance_out <- function(allelic.het) {
set_to_na <- function(row) { replace(row, is.infinite(row),NA) %>% pmin(1)}
penetrance <- (allelic.het*prev.range/allele.freq$COUNT_1000G[!is.na(combined)]) %>%
apply(2, set_to_na)
rownames(penetrance) <- doubles$Disease
ord <- penetrance %>% rowSums %>% order(decreasing = T)
penetrance <- penetrance[ord,]
colnames(penetrance) <- NULL
# replicate each element n times to create labels
penetrance_data <- data.frame("Penetrance" = penetrance %>% t %>% as.vector,
"Disease" = factor(sapply(row.names(penetrance), function(x) rep(x,5)) %>% as.vector,
levels = row.names(penetrance))
)
colormap <- rep('black', nrow(doubles))
colormap[1:(mean(is.na(penetrance_data$Penetrance))*length(colormap))] <- 'gray60'
colormap <- rev(colormap)
plot(ggplot(aes(x=Disease, y=Penetrance), data = penetrance_data) +
geom_boxplot(position = 'identity', coef = 0, na.rm = T) + coord_flip() +
ggtitle(sprintf("1000 Genomes: P(V|D) = %s", allelic.het)) +
theme(axis.text.y = element_text(color = colormap))
)
penetrance_data
}
pen_ah_0.1 <- penetrance_out(0.1)
pen_ah_1.0 <- penetrance_out(1.0)
This can only be computed in the 9 cases where a prevalence range was given (rather than a point estimate)
and the disease-level allele frequency is known (in this plot: all of them except Puetz-Jeghers).
The left end of the boxplot indicates P(D) AND P(V|D) = lower value,
the bold line in the middle indicates P(D) AND P(V|D) = mean(values),
the right end of the boxplot indicates P(D) AND P(V|D) = upper value.
get_penetrance <- function(ah_low, ah_high, dataset, range) {
# Map of disease name to disease tags
if (dataset == "1000 Genomes")
named.freqs <- allele.freq$COUNT_1000G %>% setNames(disease_abbrev_longer)
if (dataset == "ExAC")
named.freqs <- allele.freq$CALC_EXAC %>% setNames(disease_abbrev_longer)
allelic.het <- c(ah_low, ah_low, mean(c(ah_low, ah_high)) %>% signif(3), ah_high, ah_high)
inv.prev_1 <- 1/ACMG_Lit$Inverse.Prevalence.1 %>% setNames(disease_abbrev_longer)
inv.prev_2 <- 1/ACMG_Lit$Inverse.Prevalence.2 %>% setNames(disease_abbrev_longer)
#Adjust ranges of point values to 5, with arithmetic mean = original value
#take unique prev_1 values and compute a = 2k/(1+r) = lower value of a 5x range
inv.prev_1[is.na(inv.prev_2)] <- inv.prev_1[is.na(inv.prev_2)]*2/(1+range)
#take NA prev_2 values and set as 5a.
inv.prev_2[is.na(inv.prev_2)] <- inv.prev_1[is.na(inv.prev_2)]*5
prev_final <- data.frame(inv.prev_1, inv.prev_1, (inv.prev_1+inv.prev_2)/2,
inv.prev_2, inv.prev_2)
cap_at_1 <- function(row) {pmin(row,1)}
set_to_na <- function(row) { replace(row, is.infinite(row),NA) %>% pmin(1)}
penetrance <- apply(sweep(prev_final,MARGIN=2,allelic.het,`*`) / named.freqs, 1, set_to_na) %>% as.data.frame %>% unlist
# Matrix of penetrance values for allelic het range, capped at 1
order(matrix(penetrance,nrow = 5) %>% colSums, decreasing = T) -> ord
# replicate each element n times to create labels
penetrance_data <- data.frame("Penetrance" = penetrance, "Disease" =
factor(sapply(disease_abbrev_longer, function(x) rep(x,length(allelic.het))) %>% as.vector,
levels = disease_abbrev_longer[ord]) )
colormap <- rep('black', length(disease_abbrev))
colormap[1:(mean(is.na(penetrance_data$Penetrance))*length(colormap))] <- 'gray60'
colormap <- rev(colormap)
plot(ggplot(aes(x=Disease, y=Penetrance), data = penetrance_data) +
geom_boxplot(position = 'identity', coef = 0, na.rm = T) + coord_flip() +
ggtitle(dataset) + theme(axis.text.y = element_text(color = colormap))
)
penetrance_data
}
pen_1000g_max <- get_penetrance(ah_low = 0.001, ah_high = 0.5, dataset = "1000 Genomes", range = 5)
pen_exac_max <- get_penetrance(ah_low = 0.001, ah_high = 0.5, dataset = "ExAC", range = 5)
Note: Prevalence ranges of 5x were assumed for all point estimates of prevalence.
For example: a point estimate of 0.3 would be given the range [0.1, 0.5].
ancestry_penetrance <- function(ah_low, ah_high, dataset, range) {
allelic.het <- c(ah_low, ah_low, mean(c(ah_low, ah_high)) %>% signif(3), ah_high, ah_high)
sapply(c("Total",super.levels), function(superpop){
# Map of disease name to disease tags
if (dataset == "1000 Genomes") {
find <- "AF_1000G"
named.freqs <- freq_1000g.count.gene
}
if (dataset == "ExAC") {
find <- "AF_EXAC"
named.freqs <- freq_exac.calc.gene
}
if (superpop != "Total")
find <- paste(find, superpop, sep = "_")
named.freqs <- named.freqs[,find] %>% unlist %>% setNames(disease_abbrev_longer)
inv.prev_1 <- 1/ACMG_Lit$Inverse.Prevalence.1 %>% setNames(disease_abbrev_longer)
inv.prev_2 <- 1/ACMG_Lit$Inverse.Prevalence.2 %>% setNames(disease_abbrev_longer)
#Adjust ranges of point values to 5, with arithmetic mean = original value
inv.prev_1[is.na(inv.prev_2)] <- inv.prev_1[is.na(inv.prev_2)]*2/(1+range)
inv.prev_2[is.na(inv.prev_2)] <- inv.prev_1[is.na(inv.prev_2)]*5
prev_final <- data.frame(inv.prev_1, inv.prev_1, (inv.prev_1+inv.prev_2)/2,
inv.prev_2, inv.prev_2)
# Matrix of penetrance values for allelic het range, capped at 1
cap_at_1 <- function(row) {pmin(row,1)}
set_to_na <- function(row) { replace(row, is.infinite(row),NA) %>% pmin(1)}
(sweep(prev_final,MARGIN=2,allelic.het,`*`) / named.freqs) %>%
apply(1, set_to_na) %>% as.data.frame %>% unlist
}) %>% as.data.frame -> penetrance_init
order(matrix(penetrance_init$Total, nrow = 5) %>% colSums, decreasing = T) -> ord
penetrance_data <- data.frame(penetrance_init,
"Disease" = factor(sapply(disease_abbrev_longer,
function(x) rep(x,length(allelic.het))) %>% as.vector,
levels = disease_abbrev_longer[ord]) ) %>%
gather(Subset, Penetrance, -Disease)
plot(ggplot(aes(x=Disease, y=Penetrance), data = penetrance_data) +
geom_boxplot(position = 'identity', coef = 0, na.rm = T) + coord_flip() +
facet_wrap(~Subset, ncol = 2) + ggtitle(sprintf("Barplot: Penetrance by Ancestry (%s)", dataset)) +
theme(axis.text.y=element_text(size=6), axis.text.x = element_text(angle = -20, hjust = 0.4))
)
plot(ggplot(aes(x=Disease, y = Subset), data = penetrance_data[c(F,F,F,F,T),]) + coord_flip() +
geom_tile(aes(fill = Penetrance), color = 'white') + xlab("Disease") + ylab("Ancestry") +
scale_fill_gradient(low='white',high = 'darkblue', na.value = "grey50",
breaks=c(0,0.25,0.5,0.75,1), labels=c("0","0.25","0.50","0.75","1.00"), limits =c(0,1)) +
ggtitle(sprintf("Heatmap: Max Penetrance by Ancestry (%s)", dataset)) +
theme_minimal() + theme(axis.ticks = element_blank()) +
annotate("segment", y=c(0.5,5.5,6.5), yend=c(0.5,5.5,6.5),
x=0.5, xend = length(disease_abbrev)+0.5) +
annotate("segment", y=0.5, yend=6.5,
x=c(0.5,length(disease_abbrev)+0.5),
xend = c(0.5,length(disease_abbrev)+0.5))
)
cat("Dark gray boxes are NA: no associated variants discovered in that ancestral population.")
}
ancestry_penetrance(ah_low = 0.001, ah_high = 0.5, dataset = "1000 Genomes", range = 5)
## Dark gray boxes are NA: no associated variants discovered in that ancestral population.
ancestry_penetrance(ah_low = 0.001, ah_high = 0.5, dataset = "ExAC", range = 5)
## Dark gray boxes are NA: no associated variants discovered in that ancestral population.
par(mfrow=c(1,2))
pen_1000g$Penetrance[c(F,F,F,F,T)] %>% ecdf %>%
plot(ylab = "Fraction with max theoretical penetrance < P", xlab = "P",
main = "CDF: Penetrance ~ P(V|D)", ylim = c(0,1.02), pch = 19)
pen_exac$Penetrance[c(F,F,F,F,T)] %>% ecdf %>% plot(col = 'red', add = T, pch = 1)
legend("bottomright", legend = c("ExAC","1000G"), col = c("red","black"), pch = c(1,19))
pen_ah_0.1$Penetrance[c(F,F,F,F,T)] %>% ecdf %>%
plot(xlab = "P", ylab = "", main = "CDF: Penetrance ~ P(D)", ylim = c(0,1.02), pch = 19)
pen_ah_1.0$Penetrance[c(F,F,F,F,T)] %>% ecdf %>% plot(col = 'red', add = T, pch = 1)
legend("bottomright", legend = c("P(V|D) = 0.1","P(V|D) = 1.0"), col = c("black","red"), pch = c(19,1))
pen_1000g_max$Penetrance[c(F,F,F,F,T)] %>% ecdf %>%
plot(ylab = "Fraction with max theoretical penetrance < P", xlab = "P",
main = "CDF: Penetrance ~ P(V|D) AND P(D)", ylim = c(0,1.02), pch = 19)
pen_exac_max$Penetrance[c(F,F,F,F,T)] %>% ecdf %>% plot(col = 'red', add = T, pch = 1)
legend("bottomright", legend = c("ExAC","1000G"), col = c("red","black"), pch = c(1,19))
par(mfrow=c(1,1), mar=c(5, 4, 4, 2)+0.1)
penetrance_data <- data.frame(Disease = disease_abbrev_longer,
Penetrance_1000_Genomes = pen_1000g_max$Penetrance[c(F,F,T,F,F)],
Penetrance_ExAC = pen_exac_max$Penetrance[c(F,F,T,F,F)]) %>%
filter(!is.na(Penetrance_1000_Genomes) & !is.na(Penetrance_ExAC))
ggplot(aes(x = Penetrance_1000_Genomes, y = Penetrance_ExAC), data = penetrance_data) +
geom_point(stat = "identity", col = 'red') +
ggtitle("Penetrance Means: ExAC v. 1000 Genomes (log-log scale)") +
geom_text_repel(aes(label = Disease), size = 3) +
scale_x_continuous(trans='log10', breaks = 10^-(0:6)) +
scale_y_continuous(trans='log10', breaks = 10^-(0:6)) +
geom_smooth(method = "lm", formula=y~x-1) +
xlab("Penetrance (1000 Genomes)") + ylab("Penetrance (ExAC)")
lm_compare <- lm(Penetrance_1000_Genomes ~ Penetrance_ExAC, penetrance_data)
The Pearson correlation is 0.76.
Max penetrance values computed using 1000 Genomes are 0.944-fold larger than those computed using ExAC.